DO Methylation

This is the first pass analysis of methylation from 191 DO samples. To start with, I am just considering filered methylation data Anuj gave for chromosome 1. Let us first load the data sets for chr 1.

methyl_sites = read.table("methyl_chr1_sites.txt")
colnames(methyl_sites)=c("Chr","Start","End")
head(methyl_sites)
##   Chr   Start     End
## 1   1 3020793 3020794
## 2   1 3020794 3020795
## 3   1 3020813 3020814
## 4   1 3020814 3020815
## 5   1 3020836 3020837
## 6   1 3020837 3020838
methyl_counts=read.table("methyl_chr1_methylated.txt")
dim(methyl_counts)
## [1] 31716   191
methyl_counts[1:5,1:6]
##   V1 V2 V3 V4 V5 V6
## 1  7 33 20 15  4  3
## 2  8  8 21 28  5  4
## 3  5 33 21 22  4  3
## 4  7 14 18 26  6 11
## 5  6 29 19 19  3  3
total_counts=read.table("methyl_chr1_total.txt")
dim(total_counts)
## [1] 31716   191

The data is a big matrix with 31716 methylation sites in 191 DO animals. Let us find the mean methylation ratio for each methylation site across all samples.

mean_methyl=apply(methyl_counts,1,mean)
total= apply(total_counts,1,mean)
mean_methyl_df = data.frame(mean_methyl=mean_methyl,
                            total=total,
                            ave=mean_methyl/total)
#head(mean_methyl_df)
mean_methyl_df %>% ggplot(aes(ave))+geom_histogram(bins=100)

mean_methyl_df %>% ggplot(aes(mean_methyl))+geom_histogram(bins=100)+xlim(c(0,75))+ggtitle("Mean Methylation")
## Warning: Removed 45 rows containing non-finite values (stat_bin).

mean_methyl_df %>% ggplot(aes(total))+geom_histogram(bins=100)+xlim(c(0,75))+ggtitle("Mean total read counts")
## Warning: Removed 79 rows containing non-finite values (stat_bin).

methyl_prop <- methyl_counts/total_counts
print(dim(methyl_prop))
## [1] 31716   191
num_ones <- apply(methyl_prop,1,function(x){sum(x>=0.9)})
num_nas <- apply(methyl_prop,1,function(x){sum(is.na(x))})
hist(num_ones, breaks=100,col="grey")

hist(num_nas, breaks=100,col="grey")

total_methyl=apply(methyl_counts,1,sum)
total2= apply(total_counts,1,sum)
mean_methyl_df2 = data.frame(methyl=total_methyl,
                            total=total2,
                            ave=mean_methyl/total)
#head(mean_methyl_df2)
#mean_methyl_df2 %>% ggplot(aes(methyl))+geom_histogram(bins=100)+xlim(c(0,5000))
#mean_methyl_df2 %>% ggplot(aes(total))+geom_histogram(bins=100)+xlim(c(0,10000))

We are interested in estimating the proportion of methylation at each methylation site. For each site, we have 191 DO animals. We can use Beta-Binomial model to estimate the proprtion of methylation accounting for both technical and biological variations. We will take Empirical Bayes approach to the hiearchical model \[M \sim binom(N,p)\], where p is Beta distributed with parameters \(\alpha\) and \(\beta\), \[p \sim Beta(\alpha,\beta)\].

Let us fit Beta-binomial model for each methylation site using two approaches.

We will look at handfull of methylation sites from chromosome 1 and first fit data from each methylation site from all samples with Methods of Moments estimates and MLE estimates

db_ll <- function(alpha, beta) {
      -sum(VGAM::dbetabinom.ab(x, total, alpha, beta,
                             log = TRUE))
  }
for (i in 1:25){
  total = as.numeric(total_counts[i,])
  x= as.numeric(methyl_counts[i,])
  ### Method of moments estimates
  mu <- mean(x/total,na.rm=TRUE)
  sigma2 <- var(x/total,na.rm=TRUE)
  alpha_mm <- ((1 - mu)/sigma2 - 1/mu) * mu^2
  beta_mm <- alpha_mm * (1/mu - 1)
  print(alpha_mm)
  print(beta_mm)
  ll <- function(alpha, beta) {
    -sum(VGAM::dbetabinom.ab(x, total, alpha, beta, log = TRUE))
  }
  m <- mle(ll, start = list(alpha = 1, beta = 10), method = "L-BFGS-B", lower = c(0.0001, .1))
  ab     <- coef(m)
  alpha0 <- ab[1]
  beta0  <- ab[2]
  df <- data.frame(methylated=x,total=total,prop=x/total)
  ggplot(df,aes(x=prop))+geom_histogram()
  ### estimate beta parameters
  #m <- MASS::fitdistr(df$prop[-na_ind], dbeta, start = list(shape1 = 1, shape2 = 2))
  #alpha_m <- m$estimate[1]
  #beta_m <- m$estimate[2]
  t_text= paste0("MoM: alpha =",signif(alpha_mm,3),", beta = ",signif(beta_mm,3),
                 " MLE: alpha =",signif(alpha0,3),", beta = ",signif(beta0,3))
  p <- ggplot(df,aes(x=prop)) + geom_histogram(aes(y = ..density..), fill ="blue") 
  p <- p + stat_function(fun = function(x) dbeta(x, alpha_mm, beta_mm),  color = "red", size = 1) 
  p <- p + stat_function(fun = function(x) dbeta(x, alpha0, beta0),  color = "green", size = 1) 
  #p <- p + stat_function(fun = function(x) dbeta(x, alpha_m, beta_m),  color = "pink", size = 1) 
  p <- p + labs(x = "methylation rate") + ggtitle(t_text)
  print(p)
  df_eb <- df %>% mutate(eb_estimate = (methylated+alpha_mm)/(total+alpha_mm+beta_mm))
  print(head(df_eb))
  p <- ggplot(df_eb,aes(x=prop,y=eb_estimate))+geom_point(size=3,alpha=0.5)
  #p<-p+ geom_text_repel(aes(prop,eb_estimate,label = total), color = "black")
  print(p)
}
## [1] 1.981844
## [1] 0.3863413
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 31 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          7     7 1.0000000   0.9587603
## 2         33    34 0.9705882   0.9618804
## 3         20    22 0.9090909   0.9020714
## 4         15    23 0.6521739   0.6694150
## 5          4     4 1.0000000   0.9393326
## 6          3     4 0.7500000   0.7823020
## Warning: Removed 31 rows containing missing values (geom_point).

## [1] 3.951775
## [1] 0.9793277
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          8    13 0.6153846   0.6665388
## 2          8    11 0.7272727   0.7502164
## 3         21    23 0.9130435   0.8933330
## 4         28    31 0.9032258   0.8892512
## 5          5     6 0.8333333   0.8189270
## 6          4     5 0.8000000   0.8006941
## Warning: Removed 35 rows containing missing values (geom_point).

## [1] 3.326986
## [1] 0.2619668
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 30 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          5     7 0.7142857   0.7863843
## 2         33    34 0.9705882   0.9664272
## 3         21    22 0.9545455   0.9506831
## 4         22    23 0.9565217   0.9525379
## 5          4     4 1.0000000   0.9654805
## 6          3     4 0.7500000   0.8337100
## Warning: Removed 30 rows containing missing values (geom_point).

## [1] 1.206845
## [1] 0.1431226
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          7     8 0.8750000   0.8777405
## 2         14    16 0.8750000   0.8764769
## 3         18    19 0.9473684   0.9438268
## 4         26    26 1.0000000   0.9947670
## 5          6     7 0.8571429   0.8630986
## 6         11    11 1.0000000   0.9884111
## Warning: Removed 14 rows containing missing values (geom_point).

## [1] 7.613874
## [1] 0.5476229
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 40 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          6     6 1.0000000   0.9613302
## 2         29    31 0.9354839   0.9349457
## 3         19    22 0.8636364   0.8823791
## 4         19    23 0.8260870   0.8540628
## 5          3     3 1.0000000   0.9509364
## 6          3     4 0.7500000   0.8727440
## Warning: Removed 40 rows containing missing values (geom_point).

## [1] 1.821245
## [1] 0.3355423
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          7     8 0.8750000   0.8685074
## 2         15    16 0.9375000   0.9264439
## 3         16    20 0.8000000   0.8043244
## 4         22    26 0.8461538   0.8460214
## 5          7     8 0.8750000   0.8685074
## 6         11    14 0.7857143   0.7935516
## Warning: Removed 13 rows containing missing values (geom_point).

## [1] 12.44548
## [1] 1.118628
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         13    15 0.8666667   0.8908201
## 2         24    25 0.9600000   0.9450622
## 3         15    16 0.9375000   0.9283378
## 4         12    13 0.9230769   0.9202447
## 5         20    21 0.9523810   0.9387044
## 6         12    15 0.8000000   0.8558111

## [1] 5.536419
## [1] 0.5672701
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).

##   methylated total prop eb_estimate
## 1          7    10  0.7   0.7784812
## 2         19    19  1.0   0.9774029
## 3          7     7  1.0   0.9567091
## 4          0     0  NaN   0.9070611
## 5         21    21  1.0   0.9790704
## 6          7     7  1.0   0.9567091
## Warning: Removed 15 rows containing missing values (geom_point).

## [1] 6.551791
## [1] 1.378033
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1         10    10 1.0000000   0.9231430
## 2         15    19 0.7894737   0.8002945
## 3          7     7 1.0000000   0.9076993
## 4          0     0       NaN   0.8262215
## 5         19    21 0.9047619   0.8832335
## 6          5     7 0.7142857   0.7737393
## Warning: Removed 15 rows containing missing values (geom_point).

## [1] 11.68014
## [1] 4.133795
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         13    20 0.6500000   0.6891211
## 2         17    20 0.8500000   0.8008095
## 3         22    26 0.8461538   0.8054764
## 4         22    27 0.8148148   0.7866630
## 5         11    14 0.7857143   0.7607228
## 6         16    18 0.8888889   0.8186015

## [1] 8.964551
## [1] 0.5833216
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 17 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          8     9 0.8888889   0.9146359
## 2          9     9 1.0000000   0.9685505
## 3         10    10 1.0000000   0.9701593
## 4          0     0       NaN   0.9389056
## 5         13    14 0.9285714   0.9327616
## 6          9    10 0.9000000   0.9190029
## Warning: Removed 17 rows containing missing values (geom_point).

## [1] 1.332464
## [1] 0.1290579
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1         11    11 1.0000000   0.9896435
## 2          9    12 0.7500000   0.7675554
## 3         25    26 0.9615385   0.9588858
## 4         26    27 0.9629630   0.9603304
## 5          5     5 1.0000000   0.9800267
## 6          7     8 0.8750000   0.8806685
## Warning: Removed 35 rows containing missing values (geom_point).

## [1] 13.42128
## [1] 1.523929
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         15    18 0.8333333   0.8626833
## 2         20    22 0.9090909   0.9046174
## 3          7     7 1.0000000   0.9305576
## 4         16    18 0.8888889   0.8930367
## 5          6     6 1.0000000   0.9272421
## 6         11    11 1.0000000   0.9412636

## [1] 0.9614116
## [1] 0.4055804
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          7    11 0.6363636   0.6437630
## 2          9    11 0.8181818   0.8054838
## 3          0     1 0.0000000   0.4061744
## 4          0     0       NaN   0.7033045
## 5          6     6 1.0000000   0.9449463
## 6          3     4 0.7500000   0.7381065
## Warning: Removed 22 rows containing missing values (geom_point).

## [1] 8.220248
## [1] 2.120307
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         11    18 0.6111111   0.6781888
## 2         19    22 0.8636364   0.8416753
## 3          5     7 0.7142857   0.7623890
## 4         15    18 0.8333333   0.8193293
## 5          6     6 1.0000000   0.8702426
## 6         10    11 0.9090909   0.8537851

## [1] 1.013136
## [1] 0.3443206
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1          7    18 0.3888889   0.4139560
## 2         21    22 0.9545455   0.9424458
## 3          7     7 1.0000000   0.9588008
## 4         16    18 0.8888889   0.8788931
## 5          5     6 0.8333333   0.8172846
## 6          6    11 0.5454545   0.5675226

## [1] 12.63829
## [1] 1.296312
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         15    18 0.8333333   0.8654653
## 2         19    22 0.8636364   0.8804408
## 3          6     7 0.8571429   0.8903102
## 4         16    18 0.8888889   0.8967793
## 5          4     6 0.6666667   0.8346437
## 6         11    11 1.0000000   0.9480115

## [1] 5.418745
## [1] 2.3931
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         16    21 0.7619048   0.7434007
## 2         10    15 0.6666667   0.6759096
## 3         11    16 0.6875000   0.6895201
## 4         23    28 0.8214286   0.7935571
## 5         15    17 0.8823529   0.8229435
## 6         11    12 0.9166667   0.8287338

## [1] 13.38122
## [1] 3.995638
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         14    17 0.8235294   0.7965015
## 2         24    32 0.7500000   0.7570595
## 3         15    19 0.7894737   0.7801999
## 4         21    33 0.6363636   0.6824804
## 5         13    17 0.7647059   0.7674122
## 6         15    17 0.8823529   0.8255909

## [1] 7.520814
## [1] 0.7701076
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1          7     7 1.0000000   0.9496363
## 2         13    14 0.9285714   0.9205906
## 3          2     2 1.0000000   0.9251663
## 4          8     9 0.8888889   0.8976279
## 5          2     4 0.5000000   0.7746217
## 6         11    13 0.8461538   0.8698925

## [1] 7.78895
## [1] 0.5444666
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1          7     7 1.0000000   0.9644915
## 2         14    14 1.0000000   0.9756210
## 3          2     2 1.0000000   0.9473101
## 4          9    10 0.9000000   0.9157567
## 5          4     4 1.0000000   0.9558544
## 6         10    13 0.7692308   0.8338538

## [1] 6.810422
## [1] 0.8585759
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1          6     7 0.8571429   0.8732991
## 2         10    14 0.7142857   0.7757822
## 3         10    12 0.8333333   0.8546659
## 4          8     9 0.8888889   0.8885010
## 5         10    10 1.0000000   0.9514078
## 6         13    13 1.0000000   0.9584607

## [1] 14.75594
## [1] 3.03346
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##   methylated total      prop eb_estimate
## 1         20    24 0.8333333   0.8316927
## 2         11    12 0.9166667   0.8646009
## 3         17    18 0.9444444   0.8873002
## 4         31    37 0.8378378   0.8351240
## 5         14    16 0.8750000   0.8510344
## 6         14    17 0.8235294   0.8265719

## [1] 3.078589
## [1] 0.320167
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          5     6 0.8333333   0.8595381
## 2         26    30 0.8666667   0.8706489
## 3          0     0       NaN   0.9057988
## 4          1     1 1.0000000   0.9272142
## 5          5     5 1.0000000   0.9618792
## 6          9     9 1.0000000   0.9741775
## Warning: Removed 22 rows containing missing values (geom_point).

## [1] 2.076615
## [1] 0.2007204
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

##   methylated total      prop eb_estimate
## 1          6     6 1.0000000   0.9757506
## 2         23    30 0.7666667   0.7769109
## 3          0     0       NaN   0.9118617
## 4          1     1 1.0000000   0.9387550
## 5          5     5 1.0000000   0.9724184
## 6          9     9 1.0000000   0.9822014
## Warning: Removed 25 rows containing missing values (geom_point).